#' Score groups by relative similarity to two sets of terms
#'
#' Tokenizes `x`, extracts the contexts around `target`, embeds those contexts
#' with `dem()`, aggregates the embeddings by `group_var` with `dem_group()`,
#' and returns, for every group, the mean cosine similarity to the
#' `first_vec` terms minus the mean cosine similarity to the `second_vec`
#' terms.
#'
#' @param x Text input handed to `tokens()`.
#' @param target Pattern handed to `tokens_context()`.
#' @param first_vec Character vector of terms scored positively. Terms that
#'   are not row names of `pre_trained` are ignored.
#' @param second_vec Character vector of terms scored negatively. Terms that
#'   are not row names of `pre_trained` are ignored.
#' @param pre_trained Embedding matrix whose row names are terms.
#' @param transform_matrix Transformation matrix handed to `dem()`.
#' @param group_var Name of the document variable to group by; looked up in
#'   the `docvars` slot of the object returned by `dem()`.
#' @param window Context window size handed to `tokens_context()`. The former
#'   default (`window = window`) referred to itself and failed whenever the
#'   argument was omitted; `6L` is the default of `conText::tokens_context()`.
#' @param norm Normalisation handed to `sim2()`.
#' @param remove_punct,remove_symbols,remove_numbers,remove_separators
#'   Handed to `tokens()`.
#' @param valuetype,hard_cut,case_insensitive Handed to `tokens_context()`.
#'
#' @return A tibble with one row per group and the columns `group` (factor)
#'   and `val` (first-term similarity minus second-term similarity).
get_similarity_scores <- function(x,
                                  target = "TARGETWORD",
                                  first_vec,
                                  second_vec,
                                  pre_trained,
                                  transform_matrix,
                                  group_var,
                                  window = 6L,
                                  norm = "l2",
                                  remove_punct = FALSE,
                                  remove_symbols = FALSE,
                                  remove_numbers = FALSE,
                                  remove_separators = FALSE,
                                  valuetype = "fixed",
                                  hard_cut = FALSE,
                                  case_insensitive = TRUE) {

  # Rows of `pre_trained` for the requested terms. `drop = FALSE` keeps a
  # terms-by-dimensions matrix even when exactly one term is found, so the
  # orientation no longer depends on how many terms the caller supplied.
  lookup_terms <- function(terms, arg_name) {
    known_terms <- intersect(terms, rownames(pre_trained))
    if (length(known_terms) == 0) {
      stop("None of the terms in `", arg_name,
           "` are row names of `pre_trained`.", call. = FALSE)
    }
    as.matrix(pre_trained[known_terms, , drop = FALSE])
  }

  # Fail on unusable term lists before doing the expensive embedding work.
  first_matrix <- lookup_terms(first_vec, "first_vec")
  second_matrix <- lookup_terms(second_vec, "second_vec")

  # Tokenize corpus
  toks <- tokens(x,
                 remove_punct = remove_punct,
                 remove_symbols = remove_symbols,
                 remove_numbers = remove_numbers,
                 remove_separators = remove_separators)

  # Build tokenized corpus of contexts surrounding the target word
  target_toks <- tokens_context(x = toks, pattern = target,
                                valuetype = valuetype, window = window,
                                hard_cut = hard_cut,
                                case_insensitive = case_insensitive)

  # Compute ALC embeddings
  target_dfm <- dfm(target_toks)
  target_dem <- dem(x = target_dfm, pre_trained = pre_trained,
                    transform = TRUE, transform_matrix = transform_matrix,
                    verbose = TRUE)

  # Aggregate embeddings over the grouping variable
  groups <- target_dem@docvars[[group_var]]
  if (is.null(groups)) {
    stop("`group_var` does not name a document variable of the embedded ",
         "contexts.", call. = FALSE)
  }
  target_dem_grouped <- dem_group(target_dem, groups = groups)

  # Diagnostic output retained from the original implementation.
  print(dim(target_dem_grouped))
  print(dim(first_matrix))

  # Mean cosine similarity of every group embedding to a set of term vectors.
  mean_similarity <- function(term_matrix) {
    rowMeans(sim2(target_dem_grouped,
                  y = term_matrix,
                  method = "cosine", norm = norm))
  }

  first_sim <- mean_similarity(first_matrix)
  second_sim <- mean_similarity(second_matrix)

  group_first_val <- tibble(group = factor(names(first_sim)),
                            first_val = unname(first_sim))
  group_sec_val <- tibble(group = factor(names(second_sim)),
                          sec_val = unname(second_sim))

  # Positive terms count for a group, negative terms count against it.
  left_join(group_first_val, group_sec_val, by = "group") %>%
    mutate(val = first_val - sec_val) %>%
    select(group, val)
}


#' Score groups by relative similarity to two sets of terms (cached contexts)
#'
#' Reads an already tokenized context object from an RDS file, embeds it with
#' `dem()`, aggregates the embeddings by `group_var` with `dem_group()`, and
#' returns, for every group, the mean cosine similarity to the `first_vec`
#' terms minus the mean cosine similarity to the `second_vec` terms.
#'
#' @param target_toks_file Handed to `readRDS()`. The default refers to an
#'   object called `target_toks` that is resolved outside this function
#'   (normally the global environment), so supplying the argument explicitly
#'   is recommended.
#' @param first_vec Character vector of terms scored positively. Terms that
#'   are not row names of `pre_trained` are ignored.
#' @param second_vec Character vector of terms scored negatively. Terms that
#'   are not row names of `pre_trained` are ignored.
#' @param pre_trained Embedding matrix whose row names are terms.
#' @param transform_matrix Transformation matrix handed to `dem()`.
#' @param group_var Name of the document variable to group by; looked up in
#'   the `docvars` slot of the object returned by `dem()`.
#' @param norm Normalisation handed to `sim2()`.
#' @param target,window,remove_punct,remove_symbols,remove_numbers,remove_separators,valuetype,hard_cut,case_insensitive
#'   Not used by this function; kept so that existing calls which pass them
#'   keep working. `window` previously had the self-referential default
#'   `window = window`, which would fail if it were ever evaluated.
#'
#' @return A tibble with one row per group and the columns `group` (factor)
#'   and `val` (first-term similarity minus second-term similarity).
get_similarity_scores2 <- function(target_toks_file = target_toks,
                                   target = "TARGETWORD",
                                   first_vec,
                                   second_vec,
                                   pre_trained,
                                   transform_matrix,
                                   group_var,
                                   window = 6L,
                                   norm = "l2",
                                   remove_punct = FALSE,
                                   remove_symbols = FALSE,
                                   remove_numbers = FALSE,
                                   remove_separators = FALSE,
                                   valuetype = "fixed",
                                   hard_cut = FALSE,
                                   case_insensitive = TRUE) {

  # Rows of `pre_trained` for the requested terms. `drop = FALSE` keeps a
  # terms-by-dimensions matrix even when exactly one term is found, so the
  # orientation no longer depends on how many terms the caller supplied.
  lookup_terms <- function(terms, arg_name) {
    known_terms <- intersect(terms, rownames(pre_trained))
    if (length(known_terms) == 0) {
      stop("None of the terms in `", arg_name,
           "` are row names of `pre_trained`.", call. = FALSE)
    }
    as.matrix(pre_trained[known_terms, , drop = FALSE])
  }

  # Fail on unusable term lists before doing the expensive embedding work.
  first_matrix <- lookup_terms(first_vec, "first_vec")
  second_matrix <- lookup_terms(second_vec, "second_vec")

  # Load the stored context tokens. The local name differs from
  # `target_toks` on purpose: the default of `target_toks_file` refers to an
  # outer object of that name.
  context_toks <- readRDS(target_toks_file)

  # Compute ALC embeddings
  context_dfm <- dfm(context_toks)
  context_dem <- dem(x = context_dfm, pre_trained = pre_trained,
                     transform = TRUE, transform_matrix = transform_matrix,
                     verbose = TRUE)

  # Aggregate embeddings over the grouping variable
  groups <- context_dem@docvars[[group_var]]
  if (is.null(groups)) {
    stop("`group_var` does not name a document variable of the embedded ",
         "contexts.", call. = FALSE)
  }
  grouped_dem <- dem_group(context_dem, groups = groups)

  # Diagnostic output retained from the original implementation.
  print(dim(grouped_dem))
  print(dim(first_matrix))

  # Mean cosine similarity of every group embedding to a set of term vectors.
  mean_similarity <- function(term_matrix) {
    rowMeans(sim2(grouped_dem,
                  y = term_matrix,
                  method = "cosine", norm = norm))
  }

  first_sim <- mean_similarity(first_matrix)
  second_sim <- mean_similarity(second_matrix)

  group_first_val <- tibble(group = factor(names(first_sim)),
                            first_val = unname(first_sim))
  group_sec_val <- tibble(group = factor(names(second_sim)),
                          sec_val = unname(second_sim))

  # Positive terms count for a group, negative terms count against it.
  left_join(group_first_val, group_sec_val, by = "group") %>%
    mutate(val = first_val - sec_val) %>%
    select(group, val)
}